MLG Filter Statistics

Functions

See the source for details

Analysis of P. infestans

library('poppr')
## Loading required package: adegenet
## Loading required package: ade4
##    ==========================
##     adegenet 2.0.0 is loaded
##    ==========================
## 
##  - to start: type '?adegenet'
##  - to browse the adegenet website: type 'adegenetWeb()'
##  - to post questions/comments: adegenet-forum@lists.r-forge.r-project.org
##  - to report bugs, request features, contribute: http://goo.gl/dZuu5X
## 
## 
## This is poppr version 1.1.4.99.407. To get started, type package?poppr
library('ape')
library('phangorn')
library('animation')
infdat <- RCurl::getURL("https://raw.githubusercontent.com/grunwaldlab/phytophthora_id/master/shiny-server/www/genoid_infestans/reduced_database.txt.csv")
infdat <- read.table(text = infdat, header = TRUE)
pinf <- df2genind(infdat[-c(1,2)], sep = "/", ploidy = 3, ind.names = infdat[[1]], pop = infdat[[2]])
ssr <- c(3,3,2,3,3,2,2,3,3,3,3,3)
x <- as.genclone(pinf)

fstats <- filter_stats(x, bruvo.dist, plot = TRUE, replen = ssr, loss = FALSE, nclone = 18)
title(main = expression(paste(italic("P. infestans"), " reference isolates (12 SSR loci)")))

# # 
# tiff(filename = "images/pinf_cluster.tiff", width = 85, height = 68, units = "mm", res = 1200, pointsize = 6)
# fstats <- filter_stats(x, bruvo.dist, plot = TRUE, replen = ssr, loss = FALSE, nclone = 18)
# title(main = expression(paste(italic("P. infestans"), " reference isolates (12 SSR loci)")))
# dev.off()

# legend("topright", legend = c("Nearest Neighbor", "UPGMA", "Farthest Neighbor"), 
#        col = c("red", "black", "blue"), pch = 1, title = "Clustering Method")

The plot above shows how multilocus genotypes collapse under differing algorithms over genetic distance.

Below, we will collapse MLGs with a threshold of an average of 2 mutational steps over all loci and create contingency tables relating the clustered MLGs to the previously defined MLGs (eg. US-8).

z <- filter_stats(x, bruvo.dist, replen = ssr, loss = FALSE, threshold = 0.75/12, stats = "MLGS")
print(table(pop(x), z$farthest), zero.print = ".")
##        
##         3 4 5 6 8 9 10 12 15 16 17 18 20 21 22 24 25 27 28
##   B     . . . . . .  .  .  .  .  .  .  .  .  .  .  1  .  .
##   C     . . . . . .  .  .  .  .  .  .  .  .  .  1  .  .  .
##   D.1   . . . . . .  .  .  .  .  .  .  .  .  1  .  .  .  .
##   D.2   . . . . . .  .  .  .  .  .  .  .  .  1  .  .  .  .
##   EU-13 . . . . . .  .  .  .  1  .  .  .  .  .  .  .  .  .
##   EU-4  . . . . . .  .  .  .  .  1  .  .  .  .  .  .  .  .
##   EU-5  . . . . . .  .  .  .  .  .  2  .  .  .  .  .  .  .
##   EU-8  . . . . . .  .  1  .  .  .  .  .  .  .  .  .  .  .
##   US-11 . . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  2
##   US-12 . 1 . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .
##   US-14 . . . . . 1  .  .  .  .  .  .  .  .  .  .  .  .  .
##   US-17 . . . . . .  .  .  .  .  .  .  1  .  .  .  .  .  .
##   US-20 2 . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .
##   US-21 . . . . . .  .  .  .  .  .  .  .  .  .  .  .  2  .
##   US-22 . . . . . .  .  .  .  .  .  .  .  2  .  .  .  .  .
##   US-23 . . . . . .  .  .  3  .  .  .  .  .  .  .  .  .  .
##   US-24 . . . . 3 .  .  .  .  .  .  .  .  .  .  .  .  .  .
##   US-8  . . 1 1 . .  2  .  .  .  .  .  .  .  .  .  .  .  .
print(table(pop(x), z$nearest), zero.print = ".")
##        
##         3 4 5 6 8 10 12 14 16 17 18 20 21 22 24 25 27 28
##   B     . . . . .  .  .  .  .  .  .  .  .  .  .  1  .  .
##   C     . . . . .  .  .  .  .  .  .  .  .  .  1  .  .  .
##   D.1   . . . . .  .  .  .  .  .  .  .  .  1  .  .  .  .
##   D.2   . . . . .  .  .  .  .  .  .  .  .  1  .  .  .  .
##   EU-13 . . . . .  .  .  .  1  .  .  .  .  .  .  .  .  .
##   EU-4  . . . . .  .  .  .  .  1  .  .  .  .  .  .  .  .
##   EU-5  . . . . .  .  .  .  .  .  2  .  .  .  .  .  .  .
##   EU-8  . . . . .  .  1  .  .  .  .  .  .  .  .  .  .  .
##   US-11 . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  2
##   US-12 . 1 . . .  .  .  .  .  .  .  .  .  .  .  .  .  .
##   US-14 . . . . .  1  .  .  .  .  .  .  .  .  .  .  .  .
##   US-17 . . . . .  .  .  .  .  .  .  1  .  .  .  .  .  .
##   US-20 2 . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .
##   US-21 . . . . .  .  .  .  .  .  .  .  .  .  .  .  2  .
##   US-22 . . . . .  .  .  .  .  .  .  .  2  .  .  .  .  .
##   US-23 . . . . .  .  .  3  .  .  .  .  .  .  .  .  .  .
##   US-24 . . . . 3  .  .  .  .  .  .  .  .  .  .  .  .  .
##   US-8  . . 1 1 .  2  .  .  .  .  .  .  .  .  .  .  .  .
tab <- mlg.table(x, bar = FALSE)
colnames(tab) <- 1:ncol(tab)
print.table(tab, zero.print = ".") # contingency table for zero tolerance MLGs.
##       1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## B     . . . . . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  1  .
## C     . . . . . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  1  .  .
## D.1   . . . . . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  1  .  .  .  .
## D.2   . . . . . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .  1  .  .  .
## EU-13 . . . . . . . . .  .  .  .  .  .  .  1  .  .  .  .  .  .  .  .  .  .
## EU-4  . . . . . . . . .  .  .  .  .  .  .  .  1  .  .  .  .  .  .  .  .  .
## EU-5  . . . . . . . . .  .  .  .  .  .  .  .  .  1  1  .  .  .  .  .  .  .
## EU-8  . . . . . . . . .  .  .  1  .  .  .  .  .  .  .  .  .  .  .  .  .  .
## US-11 . . . . . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
## US-12 . . . 1 . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
## US-14 . . . . . . . . 1  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
## US-17 . . . . . . . . .  .  .  .  .  .  .  .  .  .  .  1  .  .  .  .  .  .
## US-20 . 1 1 . . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
## US-21 . . . . . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  1
## US-22 1 . . . . . . . .  .  .  .  .  .  .  .  .  .  .  .  1  .  .  .  .  .
## US-23 . . . . . . . . .  .  .  .  1  1  1  .  .  .  .  .  .  .  .  .  .  .
## US-24 . . . . . . 1 2 .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
## US-8  . . . . 1 1 . . .  1  1  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
##       27 28 29
## B      .  .  .
## C      .  .  .
## D.1    .  .  .
## D.2    .  .  .
## EU-13  .  .  .
## EU-4   .  .  .
## EU-5   .  .  .
## EU-8   .  .  .
## US-11  .  1  1
## US-12  .  .  .
## US-14  .  .  .
## US-17  .  .  .
## US-20  .  .  .
## US-21  1  .  .
## US-22  .  .  .
## US-23  .  .  .
## US-24  .  .  .
## US-8   .  .  .

Most of the MLGs were able to be resolved. US-8 and D.1 and D.2 are not exactly resolved, but that is shown in the trees produced from the commented code below.

# Uncomment to regenerate plots
# 
# for (i in names(fstats)){
#   HTML_collapse(measure = i, x = x, treefun = "upgma", 
#                 distfun = "bruvo.dist", fstats = fstats, destdir = "images", 
#                 replen = ssr, loss = FALSE)  
#   HTML_collapse(measure = i, x = x, treefun = "nj", 
#                 distfun = "bruvo.dist", fstats = fstats, destdir = "images", 
#                 replen = ssr, loss = FALSE)  
# }

Simulated data

This will create 20 populations with 20 samples and 10k SNPs. Each population will have:

  • n = 40
  • 10,000 snps
  • an error rate of 0.1
  • 21% missing data
  • 20 generations of mating

It is important that I define what “mating” is here. When I talk about “mating”, I have a function that will sample with replacement parental pairs. These pairs will first have a crossover event before meiosis and then single chromosomes are randomly chosen from each parent and then randomly mutated to create the new offspring. Only one offspring per parent pair is created.

The functions that perform this are :

  • random_mate_gen()
    • random_mate()
      • mate()
        • crossover()
      • pop_mutator()

This is to account for the fact that when glSim simulates populations, the terminal branches are EXTREMELY long. This scheme seems to make the branches a bit more realistic.

In addition, half of these populations will have undergone one generation of clonal reproduction. Each sample has a unique 10 letter identification. Samples from clonal populations will have a number appended to the end. We will use the 10 letter identifier to detect clones.

set.seed(20150415)
x <- lapply(1:10, getSims, n = 40, snps = 1e4, strucrat = 1, ploidy = 2, err = 0.1, na.perc = 0.21, clone = TRUE, n.cores = 4, mate_gen = 20)
## Loading required package: parallel
y <- lapply(1:10, getSims, n = 40, snps = 1e4, strucrat = 1, ploidy = 2, err = 0.1, na.perc = 0.21, clone = FALSE, n.cores = 4, mate_gen = 20)
# x <- getSims(n = 200, snps = 1e4, strucrat = 1, ploidy = 2, err = 0.1, clone = TRUE, n.cores = 4)
# y <- getSims(n = 200, snps = 1e4, strucrat = 1, ploidy = 2, err = 0.1, clone = FALSE, n.cores = 4)

The samples are then pooled.

For analysis, \(\frac{1}{5}\)th of the pooled samples will be kept.

fstats gives the statistics from mlg.filter when the threshold is set to the maximum distance possible. Finding the largest difference between two threshold values in the upper 50% of the data serves as a rough prediction of the threshold at which clones should be collapsed.

z <- do.call("rbind", c(x, y))
z <- z[sample(nInd(z), nInd(z)/5)]

trueclones    <- duplicated(substr(indNames(z), start = 1, stop = 10))
fstats        <- filter_stats(z, bitwise.dist, plot = TRUE)
(the_threshold <- threshold_predictor(fstats$average$thresholds))
## [1] 0.077075
abline(v = the_threshold, lty = 2)

This predicted threshold is then used to compare the defined clones to the true clones as presented in a contingency table.

thresh <- duplicated(mlg.filter(z, distance = bitwise.dist, 
                                threshold = the_threshold, 
                                algorithm = "a"))
(threshtable <- table(thresh, trueclones))
##        trueclones
## thresh  FALSE TRUE
##   FALSE   146    0
##   TRUE      0   14

The tabulation is a power analysis of how many true and false positives there are when collapsing at the threshold that gives the same number of known clones/replicates.

Below is labelling a tree with known clones.

the_tree <- upgma(bitwise.dist(z))
clones <- substr(the_tree$tip.label[thresh], start = 1, stop = 10)
clones <- lapply(clones, grep, the_tree$tip.label)
edgelist <- length(which.edge(the_tree, the_tree$tip.label))
edgecols <- rep("black", edgelist)
for (i in clones){
  edgecols[which.edge(the_tree, the_tree$tip.label[i])] <- "red"
}
plot.phylo(the_tree, edge.color = edgecols, adj = 0, label.offset = 0.001)
axisPhylo(1)
title("Random sequences with 10,000 SNPs and a 0.1 error rate")

Making lots of simulations

For these, we will simulate 10,000 markers for populations of 200 samples each.

nreps <- 100
resarray <- array(data = integer(nreps*4), dim = c(2, 2, nreps), 
                  dimnames = c(dimnames(threshtable), NULL))
neararray <- resarray
fararray  <- resarray
avarray   <- resarray
samplist  <- lapply(1:nreps, function(x) list(samp = NULL, tree = NULL, 
                                              mlgs = NULL))
Sys.time()
## [1] "2015-05-04 16:04:46 PDT"
for (i in 1:nreps){
  set.seed(i) # setting seed for accuracy.
  snps <- 1e4
  samp1 <- getSims(n = 200, snps = snps, strucrat = 0.5, ploidy = 2,
                   err = 0.1, na.perc = 0.1, clone = TRUE, n.cores = 4,
                   k = 10, pop.freq = rep(0.1, 10), alpha = 0.25)
  samp2 <- getSims(n = 200, snps = snps, strucrat = 0.5, ploidy = 2,
                  err = 0.1, na.perc = 0.1, clone = FALSE, n.cores = 4,
                  k = 10, pop.freq = rep(0.1, 10), alpha = 0.25)
  samp <- do.call("rbind", c(samp1, samp2))
  samp@ploidy <- rep(2L, nInd(samp))
  samp <- samp[sample(nInd(samp), nInd(samp)/5)]
  trueclones <- duplicated(substr(indNames(samp), start = 1, stop = 10))
  fstats <- filter_stats(samp, bitwise.dist, plot = TRUE)
  title(paste("seed:", i, "n:", nInd(samp), "snps:", snps))
  the_threshold <- threshold_predictor(fstats$average$thresholds)
  the_distance  <- bitwise.dist(samp)
  z <- filter_stats(x = samp, distance = bitwise.dist, 
                    threshold = the_threshold, stats = "MLGs")
  abline(v = the_threshold, lty = 2)
  text(the_threshold, 0, 
       labels = paste("Threshold:", signif(the_threshold, 3)), 
       adj = 0)
  samplist[[i]]$samp <- samp
  samplist[[i]]$tree <- upgma(the_distance)
  samplist[[i]]$mlgs <- z
  
  athresh <- duplicated(z$average)
  nthresh <- duplicated(z$nearest)
  fthresh <- duplicated(z$farthest)
  
  avarray[, , i] <- table(athresh, trueclones)
  avarray[, , i] <- sweep(avarray[, , i], 2, colSums(avarray[, , i]), "/")

  neararray[, , i] <- table(nthresh, trueclones)
  neararray[, , i] <- sweep(neararray[, , i], 2, colSums(neararray[, , i]), "/")

  fararray[, , i] <- table(fthresh, trueclones)
  fararray[, , i] <- sweep(fararray[, , i], 2, colSums(fararray[, , i]), "/")
}

Results

systime
## [1] "2015-05-04 16:40:31 PDT"
# color_mlg_tree(samp, upgma(bitwise.dist(samp)), z$average)
# axisPhylo(1)

Now we get to see how well we did.

(ares <- apply(avarray, 1:2, mean))
##        trueclones
## thresh     FALSE     TRUE
##   FALSE 0.998875 0.018875
##   TRUE  0.001125 0.981125
(nres <- apply(neararray, 1:2, mean))
##        trueclones
## thresh     FALSE     TRUE
##   FALSE 0.998875 0.018875
##   TRUE  0.001125 0.981125
(fres <- apply(fararray, 1:2, mean))
##        trueclones
## thresh     FALSE     TRUE
##   FALSE 0.998875 0.018875
##   TRUE  0.001125 0.981125
True Positive % False Positive %
farthest 98.1 0.112
average 98.1 0.112
nearest 98.1 0.112

Session Info

options(width = 100)
devtools::session_info()
## Session info ---------------------------------------------------------------------------------------
##  setting  value                       
##  version  R version 3.2.0 (2015-04-16)
##  system   x86_64, darwin13.4.0        
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  tz       America/Los_Angeles
## Packages -------------------------------------------------------------------------------------------
##  package      * version      date       source                                  
##  ade4           1.7-2        2015-04-14 CRAN (R 3.2.0)                          
##  adegenet       2.0.0        2015-05-04 Github (thibautjombart/adegenet@a230516)
##  animation      2.3          2014-07-25 CRAN (R 3.2.0)                          
##  ape            3.2          2014-12-05 CRAN (R 3.2.0)                          
##  assertthat   * 0.1          2013-12-06 CRAN (R 3.1.0)                          
##  bitops       * 1.0-6        2013-08-17 CRAN (R 3.1.0)                          
##  boot         * 1.3-16       2015-03-08 CRAN (R 3.2.0)                          
##  cluster      * 2.0.1        2015-01-31 CRAN (R 3.1.2)                          
##  coda         * 0.17-1       2015-03-03 CRAN (R 3.2.0)                          
##  codetools    * 0.2-11       2015-03-10 CRAN (R 3.2.0)                          
##  colorspace   * 1.2-6        2015-03-11 CRAN (R 3.2.0)                          
##  DBI          * 0.3.1        2014-09-24 CRAN (R 3.1.1)                          
##  deldir       * 0.1-9        2015-03-09 CRAN (R 3.2.0)                          
##  devtools     * 1.7.0        2015-01-17 CRAN (R 3.2.0)                          
##  digest       * 0.6.8        2014-12-31 CRAN (R 3.1.2)                          
##  dplyr        * 0.4.1        2015-01-14 CRAN (R 3.1.2)                          
##  evaluate     * 0.6          2015-04-13 CRAN (R 3.2.0)                          
##  formatR      * 1.1          2015-03-29 CRAN (R 3.2.0)                          
##  ggplot2      * 1.0.1        2015-03-17 CRAN (R 3.2.0)                          
##  gtable       * 0.1.2        2012-12-05 CRAN (R 3.1.0)                          
##  highr        * 0.4.1        2015-03-29 CRAN (R 3.2.0)                          
##  htmltools    * 0.2.6        2014-09-08 CRAN (R 3.1.1)                          
##  httpuv       * 1.3.2        2014-10-23 CRAN (R 3.1.2)                          
##  igraph       * 0.7.1        2014-04-22 CRAN (R 3.1.0)                          
##  knitr        * 1.10         2015-04-23 CRAN (R 3.2.0)                          
##  lattice      * 0.20-31      2015-03-30 CRAN (R 3.2.0)                          
##  LearnBayes   * 2.15         2014-05-29 CRAN (R 3.1.0)                          
##  magrittr     * 1.5          2014-11-22 CRAN (R 3.1.2)                          
##  MASS         * 7.3-40       2015-03-21 CRAN (R 3.2.0)                          
##  Matrix       * 1.2-0        2015-04-04 CRAN (R 3.2.0)                          
##  mgcv         * 1.8-6        2015-03-31 CRAN (R 3.2.0)                          
##  mime         * 0.3          2015-03-29 CRAN (R 3.1.3)                          
##  munsell      * 0.4.2        2013-07-11 CRAN (R 3.1.0)                          
##  nlme         * 3.1-120      2015-02-20 CRAN (R 3.2.0)                          
##  nnls         * 1.4          2012-03-19 CRAN (R 3.1.2)                          
##  pegas        * 0.7-0.4      2015-05-04 Github (emmanuelparadis/pegas@883d74f)  
##  permute      * 0.8-3        2014-01-29 CRAN (R 3.1.0)                          
##  phangorn       1.99-14      2015-05-04 Github (KlausVigo/phangorn@cf4aef0)     
##  plyr         * 1.8.1        2014-02-26 CRAN (R 3.1.0)                          
##  poppr          1.1.4.99-407 2015-05-04 Github (grunwaldlab/poppr@f99f2a7)      
##  proto        * 0.3-10       2012-12-22 CRAN (R 3.1.0)                          
##  R6           * 2.0.1        2014-10-29 CRAN (R 3.1.2)                          
##  RColorBrewer * 1.1-2        2014-12-07 CRAN (R 3.1.2)                          
##  Rcpp         * 0.11.5       2015-03-06 CRAN (R 3.2.0)                          
##  RCurl        * 1.95-4.5     2014-12-28 CRAN (R 3.1.2)                          
##  reshape2     * 1.4.1        2014-12-06 CRAN (R 3.1.2)                          
##  rmarkdown    * 0.5.1        2015-01-26 CRAN (R 3.2.0)                          
##  rstudioapi   * 0.3.1        2015-04-07 CRAN (R 3.2.0)                          
##  scales       * 0.2.4        2014-04-22 CRAN (R 3.1.0)                          
##  seqinr       * 3.1-3        2014-12-17 CRAN (R 3.1.2)                          
##  shiny        * 0.11.1.9004  2015-04-16 Github (rstudio/shiny@02caf05)          
##  sp           * 1.0-17       2015-01-08 CRAN (R 3.1.2)                          
##  spdep        * 0.5-88       2015-03-17 CRAN (R 3.2.0)                          
##  stringr      * 0.6.2        2012-12-06 CRAN (R 3.1.0)                          
##  vegan        * 2.2-1        2015-01-12 CRAN (R 3.1.2)                          
##  xtable       * 1.7-4        2014-09-12 CRAN (R 3.1.1)